#!/usr/bin/env python
# coding: utf-8

# ## Set current directory and random seed

# In[1]:


# Importing the required libraries
import os
import pandas as pd
import numpy as np
import random
# Step up one directory so the relative "data/..." paths used below resolve from there.
os.chdir("..")
print(os.getcwd())

# Set the random seeds.
# Python's `random` module is seeded, and NumPy's global generator is seeded as well:
# the estimators further down (PCA, NMF, TSNE, UMAP) are created without an explicit
# `random_state`, so they fall back to NumPy's global state rather than `random`.
# NOTE(review): UMAP may still vary between runs when it executes in parallel - confirm.
SEED = 10012
random.seed(SEED)
np.random.seed(SEED)


# ## Load train and test sets from files

# In[2]:


# Prefix shared by the input file name and every output file name.
dataset_name = "eo_"


# In[3]:


# Load the cleaned dataset; the first CSV column becomes the DataFrame index.
raw_csv_path = "data/raw/" + dataset_name + "clean_full.csv"
data_full = pd.read_csv(raw_csv_path, index_col=0)


# ## Count Vectors and TF-IDF Vectors

# In[4]:


# Importing in the required libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold

# Custom stop-word list handed to the vectorizer.
stp_wrds = ['a', 'an', 'the', 'of', 'and', 'but', 'or', 'of', 'to']

# Two-stage pipeline: count 1- to 3-gram occurrences, then filter out
# low-variance features (threshold 0.028).
count_vectorizer = CountVectorizer(stop_words=stp_wrds, ngram_range=(1, 3))
variance_filter = VarianceThreshold(threshold=0.028)
pipe = Pipeline([
    ('vect', count_vectorizer),
    ('selector', variance_filter),
])

# fit_transform learns the vocabulary / variance mask and transforms the same
# corpus in one call; the labels are passed along as `y`.
cvec_full = pipe.fit_transform(data_full['text'], data_full['label'])

# This is used for pre-processing before modelling
#cvec_test = pipe.transform(data_test['text'])

cvec_full.shape#, cvec_test.shape


# In[5]:


# Densify the count matrix, attach the labels, and save the table as CSV.
# This is the first write into data/output/, so make sure the directory exists;
# otherwise the save fails only after the (expensive) vectorizer fit has finished.
os.makedirs("data/output", exist_ok=True)
temp = pd.DataFrame(cvec_full.toarray())
# .tolist() assigns the labels by position instead of aligning on data_full's index.
temp['label'] = data_full['label'].tolist()
temp.to_csv("data/output/" + dataset_name+'cvec_full.csv')


# In[6]:


# Same stop words as for the count vectors, this time feeding a TF-IDF vectorizer.
stp_wrds = ['a', 'an', 'the', 'of', 'and', 'but', 'or', 'of', 'to']
tfidf_steps = [
    ('vect', TfidfVectorizer(stop_words=stp_wrds, ngram_range=(1, 3))),
    # Much smaller variance threshold than for raw counts (0.00001).
    ('selector', VarianceThreshold(threshold=0.00001)),
]
pipe = Pipeline(tfidf_steps)
tfidf_full = pipe.fit_transform(data_full['text'], data_full['label'])
tfidf_full.shape


# In[7]:


# Dense TF-IDF table plus a positional label column, written to the output folder.
tfidf_dense = tfidf_full.toarray()
temp = pd.DataFrame(tfidf_dense)
label_values = data_full['label'].tolist()
temp['label'] = label_values
tfidf_csv_path = "data/output/"+dataset_name+'tfidf_full.csv'
temp.to_csv(tfidf_csv_path)


# In[8]:


# Drop the name of the dense table now that it has been saved.
del temp


# ## Sentence/Doc-Level Embeddings

# In[9]:


#from bert_embedding import BertEmbedding
#bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')


# ### BERT-based

# In[10]:


from sentence_transformers import SentenceTransformer
# Names of the sentence-transformers models loaded in the cells below.
# This is an inert string literal used as a note; it is never executed.
'''
roberta-base-nli-stsb-mean-tokens
bert-base-nli-stsb-mean-tokens
distilroberta-base-paraphrase-v1
'''


# In[11]:


# Encode every document with the RoBERTa sentence-transformers model.
roberta_checkpoint = 'roberta-base-nli-stsb-mean-tokens'
sbert_model = SentenceTransformer(roberta_checkpoint)
roberta_texts = data_full['text'].tolist()
sentence_embeddings = sbert_model.encode(roberta_texts)
sentence_embeddings.shape


# In[12]:


# One row per document; saved without the label column.
roberta_frame = pd.DataFrame(sentence_embeddings)
roberta_frame.to_csv("data/output/"+dataset_name+"roberta_full.csv")


# In[13]:


# Repeat the encoding with the BERT sentence-transformers model.
sbert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
bert_texts = data_full['text'].tolist()
sentence_embeddings = sbert_model.encode(bert_texts)
sentence_embeddings.shape


# In[14]:


bert_csv_path = "data/output/"+dataset_name+"bert_full.csv"
pd.DataFrame(sentence_embeddings).to_csv(bert_csv_path)


# In[15]:


# Third sentence encoder: the DistilRoBERTa paraphrase model.
distil_checkpoint = 'distilroberta-base-paraphrase-v1'
sbert_model = SentenceTransformer(distil_checkpoint)
sentence_embeddings = sbert_model.encode(
    data_full['text'].tolist()
)
sentence_embeddings.shape


# In[16]:


distil_frame = pd.DataFrame(sentence_embeddings)
distil_frame.to_csv("data/output/"+dataset_name+"distil_full.csv")


# ### GloVe Embeddings

# In[17]:


"""
sbert_model = SentenceTransformer('average_word_embeddings_glove.840B.300d')
sentence_embeddings = sbert_model.encode(data_train['text'].tolist())
sentence_embeddings.shape

pd.DataFrame(sentence_embeddings).to_csv(dataset_name+"glove840B_train.csv")
"""


# In[18]:


# Averaged GloVe 6B (300d) word vectors, loaded through sentence-transformers.
glove_checkpoint = 'average_word_embeddings_glove.6B.300d'
sbert_model = SentenceTransformer(glove_checkpoint)
glove_texts = data_full['text'].tolist()
sentence_embeddings = sbert_model.encode(glove_texts)
sentence_embeddings.shape


# In[19]:


glove_csv_path = "data/output/"+dataset_name+"glove6B_full.csv"
pd.DataFrame(sentence_embeddings).to_csv(glove_csv_path)


# ### Universal Sentence Encoder

# In[20]:


import tensorflow as tf
import tensorflow_hub as hub
import numpy as np


# In[21]:


# TF-Hub handle of the Universal Sentence Encoder; hub.load fetches and loads it.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print("module %s loaded" % (module_url,))


# In[22]:


# Plain Python list of the documents to embed.
text_column = data_full['text']
sentence_list = text_column.tolist()
len(sentence_list)


# In[23]:


# Embed each document with the Universal Sentence Encoder, one model call per
# sentence (a single-element list goes in, the first row of the result comes out).
# Iterating over the sentences directly avoids the index-based range(len(...)) loop.
# NOTE(review): the model is already called with a list, so passing several
# sentences per call would likely be much faster - confirm outputs match first.
sentence_embeddings = [np.array(model([sentence])[0]) for sentence in sentence_list]

# Stack the per-sentence vectors once and reuse the array for the shape check
# and for the CSV export.
universal_embeddings = np.array(sentence_embeddings)
universal_embeddings.shape


# In[24]:


pd.DataFrame(universal_embeddings).to_csv("data/output/"+dataset_name+"universal_full.csv")


# ## Dimension Reduction

# In[25]:


# Reload the saved count / TF-IDF tables as DataFrames (features plus 'label'),
# replacing the matrices that were held under the same names.
cvec_csv_path = "data/output/"+dataset_name+'cvec_full.csv'
tfidf_csv_path = "data/output/"+dataset_name+'tfidf_full.csv'
cvec_full = pd.read_csv(cvec_csv_path, index_col=0)
tfidf_full = pd.read_csv(tfidf_csv_path, index_col=0)


# ### PCA, UMAP, NMF

# In[26]:


from sklearn.decomposition import PCA
# Project each feature table (labels removed) onto 16 principal components and save it.
for pca_source, pca_file in (
    (cvec_full, 'cvec_pca16_full.csv'),
    (tfidf_full, 'tfidf_pca16_full.csv'),
):
    pca_features = pca_source.drop(columns=['label'])
    pca_result = PCA(n_components=16).fit_transform(pca_features)
    pd.DataFrame(pca_result).to_csv("data/output/"+dataset_name+pca_file)


# In[27]:


import umap.umap_ as umap
# 16-dimensional UMAP embedding of the count vectors (label column removed).
cvec_umap_input = cvec_full.drop(columns=['label'])
cvec_umap = umap.UMAP(n_components=16).fit_transform(cvec_umap_input)
pd.DataFrame(cvec_umap).to_csv("data/output/"+dataset_name+'cvec_umap16_full.csv')

# Same reduction for the TF-IDF vectors, using a fresh UMAP instance.
tfidf_umap_input = tfidf_full.drop(columns=['label'])
tfidf_umap = umap.UMAP(n_components=16).fit_transform(tfidf_umap_input)
pd.DataFrame(tfidf_umap).to_csv("data/output/"+dataset_name+'tfidf_umap16_full.csv')


# In[28]:


from sklearn.decomposition import NMF
# Factorize each feature table into 16 NMF components (NNDSVD initialisation) and
# save the per-document component weights.
nmf_sources = [cvec_full, tfidf_full]
nmf_files = ['cvec_nmf16_full.csv', 'tfidf_nmf16_full.csv']
for nmf_source, nmf_file in zip(nmf_sources, nmf_files):
    nmf_model = NMF(n_components=16, init='nndsvd')
    nmf_weights = nmf_model.fit_transform(nmf_source.drop(columns=['label']))
    pd.DataFrame(nmf_weights).to_csv("data/output/"+dataset_name+nmf_file)


# In[29]:


from sklearn.manifold import TSNE
# t-SNE embeddings of both feature tables (label column removed), random initialisation.
# NOTE(review): n_components is 2 although the output file names say "tsne16" -
# confirm which is intended before renaming, since other code may read these paths.
cvec_tsne_input = cvec_full.drop(columns=['label'])
cvec_tsne = TSNE(n_components=2, init='random').fit_transform(cvec_tsne_input)
pd.DataFrame(cvec_tsne).to_csv("data/output/"+dataset_name+'cvec_tsne16_full.csv')

tfidf_tsne_input = tfidf_full.drop(columns=['label'])
tfidf_tsne = TSNE(n_components=2, init='random').fit_transform(tfidf_tsne_input)
pd.DataFrame(tfidf_tsne).to_csv("data/output/"+dataset_name+'tfidf_tsne16_full.csv')


# * BERT
# * DistilBERT
# * RoBERTa
# * Universal Sentence Encoder
# * Glove6B
# * pca16-cvec
# * pca16-tfidf
# * umap16-cvec
# * umap16-tfidf
# * nmf16-cvec
# * nmf16-tfidf
# * tsne16-cvec
# * tsne16-tfidf

# ## Topic Modeling with LDA (Taddy)

# In[30]:


from sklearn.decomposition import LatentDirichletAllocation
# Fit a 100-topic LDA model (fixed random_state=0) on the count vectors and save
# the per-document output of fit_transform.
lda_model = LatentDirichletAllocation(n_components=100, random_state=0)
lda_features = cvec_full.drop(columns=['label'])
doc_topic = lda_model.fit_transform(lda_features)
pd.DataFrame(doc_topic).to_csv("data/output/"+dataset_name+'lda100_full.csv')

